#importing of libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from datetime import datetime
#read the csv file to explore the dataset
df = pd.read_csv('dataset_2017_2020.csv')
#Exploratory Data Analysis or EDA
df.dtypes #check that the data types for each column is correct
customer_id int64 product_id int64 basket_id int64 loyalty object household_type object age_band object department object brand object commodity object store int64 transaction_date object price float64 days_since_first int64 dtype: object
df.shape ##check shape of data (no.of rows, no. of columns)
(77750, 13)
df.notnull().sum() ##check for null values in each columns
customer_id 77750 product_id 77750 basket_id 77750 loyalty 77750 household_type 77750 age_band 77750 department 77750 brand 77750 commodity 77750 store 77750 transaction_date 77750 price 77750 days_since_first 77750 dtype: int64
#check unique categories of loyalty for any invalid entries
np.unique(df["loyalty"])
array(['First Time Buyer', 'Loyalist', 'Promiscuous'], dtype=object)
#check unique categories of household_type for any invalid entries
np.unique(df["household_type"])
array(['1 adult with kids', '2 adults with kids', '2 adults with no kids',
'Single female', 'Single male'], dtype=object)
#check unique categories of age_band for any invalid entries
np.unique(df["age_band"])
array(['19-24', '25-34', '35-44', '45-54', '55-64', 'Greater than 65'],
dtype=object)
#check unique categories of department for any invalid entries
np.unique(df["department"])
array(['Cosmetics', 'Deli', 'Floral', 'Grocery', 'Meat', 'Nutrition',
'Pastry', 'Pharmaceutical', 'Produce', 'Salad Bar', 'Seafood'],
dtype=object)
#check for and deleting duplicates
df.drop_duplicates(keep= 'first', inplace=True)
df
| customer_id | product_id | basket_id | loyalty | household_type | age_band | department | brand | commodity | store | transaction_date | price | days_since_first | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 15803 | 1131974 | 57266 | Loyalist | 1 adult with kids | 19-24 | Grocery | private | Baked bread/buns/rolls | 374 | 2018-10-18 00:00:00 | 0.99 | 0 |
| 1 | 15803 | 1051516 | 57266 | Loyalist | 1 adult with kids | 19-24 | Produce | national | Vegetables - all others | 374 | 2018-10-18 00:00:00 | 0.70 | 0 |
| 2 | 15803 | 967254 | 57266 | Loyalist | 1 adult with kids | 19-24 | Pharmaceutical | national | Cold and flu | 374 | 2018-10-18 00:00:00 | 1.68 | 0 |
| 3 | 15803 | 1134222 | 57266 | Loyalist | 1 adult with kids | 19-24 | Grocery | private | Paper housewares | 374 | 2018-10-18 00:00:00 | 2.59 | 0 |
| 4 | 15803 | 1003421 | 57266 | Loyalist | 1 adult with kids | 19-24 | Grocery | national | Soup | 374 | 2018-10-18 00:00:00 | 0.60 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 77745 | 36987 | 860776 | 80984 | Promiscuous | 2 adults with kids | 45-54 | Produce | national | Vegetables - all others | 374 | 2018-07-19 00:00:00 | 0.79 | 0 |
| 77746 | 36987 | 1060673 | 80984 | Promiscuous | 2 adults with kids | 45-54 | Meat | national | Lunch meat | 374 | 2018-07-19 00:00:00 | 3.00 | 0 |
| 77747 | 36987 | 973374 | 80984 | Promiscuous | 2 adults with kids | 45-54 | Produce | private | Onions | 374 | 2018-07-19 00:00:00 | 1.99 | 0 |
| 77748 | 36987 | 13506200 | 80984 | Promiscuous | 2 adults with kids | 45-54 | Meat | national | Beef | 374 | 2018-07-19 00:00:00 | 4.92 | 0 |
| 77749 | 36987 | 16097722 | 80984 | Promiscuous | 2 adults with kids | 45-54 | Pastry | national | Cakes | 374 | 2018-07-19 00:00:00 | 1.79 | 0 |
76117 rows × 13 columns
#replacing Promiscuous to Non-Loyalist for
df.loyalty.replace("Promiscuous", "Non-Loyalist", inplace=True)
np.unique(df["loyalty"])
array(['First Time Buyer', 'Loyalist', 'Non-Loyalist'], dtype=object)
#changing transaction date format to datetime_format
df.transaction_date = pd.to_datetime(df.transaction_date,format='%Y-%m-%d')
df["year"] = df.transaction_date.dt.year
df["month"] = df.transaction_date.dt.month
df.head()
| customer_id | product_id | basket_id | loyalty | household_type | age_band | department | brand | commodity | store | transaction_date | price | days_since_first | year | month | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 15803 | 1131974 | 57266 | Loyalist | 1 adult with kids | 19-24 | Grocery | private | Baked bread/buns/rolls | 374 | 2018-10-18 | 0.99 | 0 | 2018 | 10 |
| 1 | 15803 | 1051516 | 57266 | Loyalist | 1 adult with kids | 19-24 | Produce | national | Vegetables - all others | 374 | 2018-10-18 | 0.70 | 0 | 2018 | 10 |
| 2 | 15803 | 967254 | 57266 | Loyalist | 1 adult with kids | 19-24 | Pharmaceutical | national | Cold and flu | 374 | 2018-10-18 | 1.68 | 0 | 2018 | 10 |
| 3 | 15803 | 1134222 | 57266 | Loyalist | 1 adult with kids | 19-24 | Grocery | private | Paper housewares | 374 | 2018-10-18 | 2.59 | 0 | 2018 | 10 |
| 4 | 15803 | 1003421 | 57266 | Loyalist | 1 adult with kids | 19-24 | Grocery | national | Soup | 374 | 2018-10-18 | 0.60 | 0 | 2018 | 10 |
#Visualize Revenue by Customer Loyalty over time
plt.figure(figsize=(12,8))
tmp1=df.groupby(["loyalty","transaction_date"]).agg(revenue=('price', sum)).reset_index()
sns.lineplot(data=tmp1, x='transaction_date', y='revenue', hue='loyalty')
plt.xlabel('Date',fontsize='18')
plt.ylabel('Revenue',fontsize='18')
plt.title("Revenue by Customer Loyalty", fontsize='24')
plt.legend(fontsize=11)
#Using the method date_range to customize the number of dates in our x-axis.
x_labels = pd.date_range(tmp1.transaction_date.min(),tmp1.transaction_date.max(),6)
plt.xticks(x_labels,fontsize='14')
plt.yticks(fontsize='14')
plt.savefig("Revenue Per Customer Loyalty")
#showing top 5 commodities purchased by customers
tmp2 = df.groupby(['loyalty', 'commodity']).agg(total_revenue=('price', sum)).reset_index()
top_5 = pd.concat(
[tmp2[tmp2.loyalty == hh] \
.sort_values('total_revenue', ascending=False) \
.head(5) for hh in tmp2.loyalty.unique()]).reset_index(drop=True)
top_5
| loyalty | commodity | total_revenue | |
|---|---|---|---|
| 0 | First Time Buyer | Beef | 212.82 |
| 1 | First Time Buyer | Lunch meat | 93.46 |
| 2 | First Time Buyer | Frozen meat | 91.97 |
| 3 | First Time Buyer | Seafood-frozen | 84.54 |
| 4 | First Time Buyer | Chicken | 75.67 |
| 5 | Loyalist | Beef | 6678.30 |
| 6 | Loyalist | Cheese | 2491.30 |
| 7 | Loyalist | Frozen meat | 2408.44 |
| 8 | Loyalist | Salad | 2347.25 |
| 9 | Loyalist | Seafood-frozen | 2314.53 |
| 10 | Non-Loyalist | Beef | 9889.91 |
| 11 | Non-Loyalist | Cheese | 3718.07 |
| 12 | Non-Loyalist | Frozen meat | 3651.59 |
| 13 | Non-Loyalist | Deli meats | 3385.15 |
| 14 | Non-Loyalist | Salad | 3157.32 |
#plotting interactive graph for Top Commodities Revenue per Loyalty
data = []
for d in top_5.loyalty.unique():
tmp2 = top_5[top_5.loyalty==d].groupby(['commodity']).agg(revenue=('total_revenue', sum)).reset_index()
data.append(go.Bar(x=tmp2.commodity, y=tmp2.revenue, name = d))
fig = go.Figure(
data = data,
layout = go.Layout(
title ='Top Commodities Revenue per Loyalty',
yaxis=dict(
title='Revenue'
)
)
)
fig.update_layout(barmode='stack',xaxis_tickangle=-45)
fig.show()
tmp_stats = df.groupby(['loyalty', 'basket_id']).agg(total_revenue=('price', sum)).reset_index() tmp_stats.groupby('loyalty').agg( avg_basket_amount=('total_revenue', 'mean'), mdn_basket_amount=('total_revenue', 'median'), total_basket_amount=('total_revenue', 'sum'), num_baskets=('loyalty', 'count') )
#creating a new dataframe for beef via subsetting
beef = df[df.commodity == 'Beef'].copy()
beef.transaction_date = pd.to_datetime(beef.transaction_date, format='%Y-%m-%d')
beef['year'] = beef.transaction_date.dt.year
beef['month'] = beef.transaction_date.dt.month
#grouping dataframe by year and showing total revenue
tmp_beef = beef.groupby('year').agg(total_revenue=('price', sum)).reset_index()
#indicating it is a scatter plot
data_beef = go.Scatter(x=tmp_beef.year, y=tmp_beef.total_revenue)
fig2 = go.Figure(
data=data_beef,
layout = go.Layout(
title ='Beef consumption trend',
xaxis=dict(title='Year'),
yaxis=dict(
title='Revenue'
)
)
)
fig2.update_xaxes(ticks="inside",ticktext=["2017","Mid-2017","2018","Mid-2018","2019","Mid-2019","2020"],
tickvals=["2017","2,017.5","2018","2,018.5","2019","2,019.5","2020"])
fig2.show()
meat = df[df.commodity.str.lower().str.contains('meat|beef|chicken|seafood|pork')].copy() meat.transaction_date = pd.to_datetime(meat.transaction_date, format='%Y-%m-%d') meat['year'] = meat.transaction_date.dt.year meat['month'] = meat.transaction_date.dt.month meat.groupby(['year', 'month']).agg(total_revenue=('price', sum)).plot(figsize=(12,5));
#creating a new dataframe for meat related products
meat = df[df.commodity.str.lower().str.contains('meat|beef|chicken|seafood|pork')].copy()
meat.transaction_date = pd.to_datetime(meat.transaction_date, format='%Y-%m-%d')
tmp2=meat.groupby(['transaction_date']).agg(total_revenue=('price', sum)).reset_index()
#plotting graph to show revenue of meat related products over time
plt.figure(figsize=(12,8))
sns.lineplot(data=tmp2, x='transaction_date', y='total_revenue')
plt.xlabel('Date',fontsize='18')
plt.ylabel('Revenue',fontsize='18')
plt.title("Meat Revenue", fontsize='24')
#Using the method date_range to customize the number of dates in our x-axis.
x_labels = pd.date_range(tmp1.transaction_date.min(),tmp1.transaction_date.max(),6)
plt.xticks(x_labels,fontsize='14')
plt.yticks(fontsize='14')
plt.savefig("Meat Revenue")